{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial 3: SQL data source"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"../../imgs/lightautoml_logo_color.png\" alt=\"LightAutoML logo\" style=\"width:100%;\"/>"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Official LightAutoML github repository is [here](https://github.com/AILab-MLTools/LightAutoML)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1. Install LightAutoML "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Uncomment if doesn't clone repository by git. (ex.: colab, kaggle version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#! pip install -U lightautoml"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2. Import necessary libraries "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard python libraries\n",
    "import os\n",
    "import time\n",
    "import requests\n",
    "# Installed libraries\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "import torch\n",
    "\n",
    "# Imports from our package\n",
    "import gensim\n",
    "from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML\n",
    "from lightautoml.dataset.roles import DatetimeRole\n",
    "from lightautoml.tasks import Task"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3. Parameters "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "N_THREADS = 8 # threads cnt for lgbm and linear models\n",
    "N_FOLDS = 5 # folds cnt for AutoML\n",
    "RANDOM_STATE = 42 # fixed random state for various reasons\n",
    "TEST_SIZE = 0.2 # Test size for metric check\n",
    "TIMEOUT = 300 # Time in seconds for automl run\n",
    "TARGET_NAME = 'TARGET' # Target column name"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4. Fix torch number of threads and numpy seed "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(RANDOM_STATE)\n",
    "torch.set_num_threads(N_THREADS)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 5. Example data load "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load a dataset from the repository if doesn't clone repository by git."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_DIR = '../data/'\n",
    "DATASET_NAME = 'sampled_app_train.csv'\n",
    "DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)\n",
    "DATASET_URL = 'https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/master/examples/data/sampled_app_train.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 29 µs, sys: 20 µs, total: 49 µs\n",
      "Wall time: 68.4 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "if not os.path.exists(DATASET_FULLNAME):\n",
    "    os.makedirs(DATASET_DIR, exist_ok=True)\n",
    "\n",
    "    dataset = requests.get(DATASET_URL).text\n",
    "    with open(DATASET_FULLNAME, 'w') as output:\n",
    "        output.write(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 104 ms, sys: 19.8 ms, total: 123 ms\n",
      "Wall time: 122 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SK_ID_CURR</th>\n",
       "      <th>TARGET</th>\n",
       "      <th>NAME_CONTRACT_TYPE</th>\n",
       "      <th>CODE_GENDER</th>\n",
       "      <th>FLAG_OWN_CAR</th>\n",
       "      <th>FLAG_OWN_REALTY</th>\n",
       "      <th>CNT_CHILDREN</th>\n",
       "      <th>AMT_INCOME_TOTAL</th>\n",
       "      <th>AMT_CREDIT</th>\n",
       "      <th>AMT_ANNUITY</th>\n",
       "      <th>...</th>\n",
       "      <th>FLAG_DOCUMENT_18</th>\n",
       "      <th>FLAG_DOCUMENT_19</th>\n",
       "      <th>FLAG_DOCUMENT_20</th>\n",
       "      <th>FLAG_DOCUMENT_21</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_HOUR</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_DAY</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_WEEK</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_MON</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_QRT</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_YEAR</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>313802</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>M</td>\n",
       "      <td>N</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>270000.0</td>\n",
       "      <td>327024.0</td>\n",
       "      <td>15372.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>319656</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>N</td>\n",
       "      <td>0</td>\n",
       "      <td>108000.0</td>\n",
       "      <td>675000.0</td>\n",
       "      <td>19737.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>207678</td>\n",
       "      <td>0</td>\n",
       "      <td>Revolving loans</td>\n",
       "      <td>F</td>\n",
       "      <td>Y</td>\n",
       "      <td>Y</td>\n",
       "      <td>2</td>\n",
       "      <td>112500.0</td>\n",
       "      <td>270000.0</td>\n",
       "      <td>13500.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>381593</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>N</td>\n",
       "      <td>1</td>\n",
       "      <td>67500.0</td>\n",
       "      <td>142200.0</td>\n",
       "      <td>9630.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>258153</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>Y</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>337500.0</td>\n",
       "      <td>1483231.5</td>\n",
       "      <td>46570.5</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 122 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \\\n",
       "0      313802       0         Cash loans           M            N   \n",
       "1      319656       0         Cash loans           F            N   \n",
       "2      207678       0    Revolving loans           F            Y   \n",
       "3      381593       0         Cash loans           F            N   \n",
       "4      258153       0         Cash loans           F            Y   \n",
       "\n",
       "  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \\\n",
       "0               Y             0          270000.0    327024.0      15372.0   \n",
       "1               N             0          108000.0    675000.0      19737.0   \n",
       "2               Y             2          112500.0    270000.0      13500.0   \n",
       "3               N             1           67500.0    142200.0       9630.0   \n",
       "4               Y             0          337500.0   1483231.5      46570.5   \n",
       "\n",
       "   ...  FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21  \\\n",
       "0  ...                 0                0                0                0   \n",
       "1  ...                 0                0                0                0   \n",
       "2  ...                 0                0                0                0   \n",
       "3  ...                 0                0                0                0   \n",
       "4  ...                 0                0                0                0   \n",
       "\n",
       "  AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY  \\\n",
       "0                        0.0                       0.0   \n",
       "1                        0.0                       0.0   \n",
       "2                        0.0                       0.0   \n",
       "3                        0.0                       0.0   \n",
       "4                        0.0                       0.0   \n",
       "\n",
       "   AMT_REQ_CREDIT_BUREAU_WEEK  AMT_REQ_CREDIT_BUREAU_MON  \\\n",
       "0                         0.0                        0.0   \n",
       "1                         0.0                        0.0   \n",
       "2                         0.0                        0.0   \n",
       "3                         0.0                        0.0   \n",
       "4                         0.0                        2.0   \n",
       "\n",
       "   AMT_REQ_CREDIT_BUREAU_QRT  AMT_REQ_CREDIT_BUREAU_YEAR  \n",
       "0                        0.0                         1.0  \n",
       "1                        0.0                         0.0  \n",
       "2                        0.0                         1.0  \n",
       "3                        0.0                         4.0  \n",
       "4                        0.0                         0.0  \n",
       "\n",
       "[5 rows x 122 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "data = pd.read_csv(DATASET_FULLNAME)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 6. (Optional) Some user feature preparation "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cell below shows some user feature preparations to create task more difficult (this block can be omitted if you don't want to change the initial data):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 105 ms, sys: 8.82 ms, total: 114 ms\n",
      "Wall time: 112 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)\n",
    "data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))\n",
    "                    ).astype(str)\n",
    "\n",
    "data['constant'] = 1\n",
    "data['allnan'] = np.nan\n",
    "\n",
    "data['report_dt'] = np.datetime64('2018-01-01')\n",
    "\n",
    "data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 7. (Optional) Data splitting for train-test "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Block below can be omitted if you are going to train model only or you have specific train and test files:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)\n",
      "CPU times: user 11.2 ms, sys: 0 ns, total: 11.2 ms\n",
      "Wall time: 9.95 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "train_data, test_data = train_test_split(data, \n",
    "                                         test_size=TEST_SIZE, \n",
    "                                         stratify=data[TARGET_NAME], \n",
    "                                         random_state=RANDOM_STATE)\n",
    "print('Data splitted. Parts sizes: train_data = {}, test_data = {}'\n",
    "              .format(train_data.shape, test_data.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SK_ID_CURR</th>\n",
       "      <th>TARGET</th>\n",
       "      <th>NAME_CONTRACT_TYPE</th>\n",
       "      <th>CODE_GENDER</th>\n",
       "      <th>FLAG_OWN_CAR</th>\n",
       "      <th>FLAG_OWN_REALTY</th>\n",
       "      <th>CNT_CHILDREN</th>\n",
       "      <th>AMT_INCOME_TOTAL</th>\n",
       "      <th>AMT_CREDIT</th>\n",
       "      <th>AMT_ANNUITY</th>\n",
       "      <th>...</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_DAY</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_WEEK</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_MON</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_QRT</th>\n",
       "      <th>AMT_REQ_CREDIT_BUREAU_YEAR</th>\n",
       "      <th>BIRTH_DATE</th>\n",
       "      <th>EMP_DATE</th>\n",
       "      <th>constant</th>\n",
       "      <th>allnan</th>\n",
       "      <th>report_dt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6444</th>\n",
       "      <td>112261</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>N</td>\n",
       "      <td>1</td>\n",
       "      <td>90000.0</td>\n",
       "      <td>640080.0</td>\n",
       "      <td>31261.5</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1985-06-28</td>\n",
       "      <td>2012-06-21</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3586</th>\n",
       "      <td>115058</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>180000.0</td>\n",
       "      <td>239850.0</td>\n",
       "      <td>23850.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1953-12-27</td>\n",
       "      <td>2018-01-01</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9349</th>\n",
       "      <td>326623</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>112500.0</td>\n",
       "      <td>337500.0</td>\n",
       "      <td>31086.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1975-06-21</td>\n",
       "      <td>2016-06-17</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7734</th>\n",
       "      <td>191976</td>\n",
       "      <td>0</td>\n",
       "      <td>Cash loans</td>\n",
       "      <td>M</td>\n",
       "      <td>Y</td>\n",
       "      <td>Y</td>\n",
       "      <td>1</td>\n",
       "      <td>67500.0</td>\n",
       "      <td>135000.0</td>\n",
       "      <td>9018.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1988-04-27</td>\n",
       "      <td>2009-06-05</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-01-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2174</th>\n",
       "      <td>281519</td>\n",
       "      <td>0</td>\n",
       "      <td>Revolving loans</td>\n",
       "      <td>F</td>\n",
       "      <td>N</td>\n",
       "      <td>Y</td>\n",
       "      <td>0</td>\n",
       "      <td>67500.0</td>\n",
       "      <td>202500.0</td>\n",
       "      <td>10125.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1975-06-13</td>\n",
       "      <td>1997-01-22</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-01-01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 125 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \\\n",
       "6444      112261       0         Cash loans           F            N   \n",
       "3586      115058       0         Cash loans           F            N   \n",
       "9349      326623       0         Cash loans           F            N   \n",
       "7734      191976       0         Cash loans           M            Y   \n",
       "2174      281519       0    Revolving loans           F            N   \n",
       "\n",
       "     FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \\\n",
       "6444               N             1           90000.0    640080.0      31261.5   \n",
       "3586               Y             0          180000.0    239850.0      23850.0   \n",
       "9349               Y             0          112500.0    337500.0      31086.0   \n",
       "7734               Y             1           67500.0    135000.0       9018.0   \n",
       "2174               Y             0           67500.0    202500.0      10125.0   \n",
       "\n",
       "      ...  AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK  \\\n",
       "6444  ...                        0.0                        0.0   \n",
       "3586  ...                        0.0                        0.0   \n",
       "9349  ...                        0.0                        0.0   \n",
       "7734  ...                        NaN                        NaN   \n",
       "2174  ...                        0.0                        0.0   \n",
       "\n",
       "     AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT  \\\n",
       "6444                       0.0                       1.0   \n",
       "3586                       0.0                       0.0   \n",
       "9349                       0.0                       0.0   \n",
       "7734                       NaN                       NaN   \n",
       "2174                       0.0                       0.0   \n",
       "\n",
       "     AMT_REQ_CREDIT_BUREAU_YEAR  BIRTH_DATE    EMP_DATE  constant  allnan  \\\n",
       "6444                        0.0  1985-06-28  2012-06-21         1     NaN   \n",
       "3586                        3.0  1953-12-27  2018-01-01         1     NaN   \n",
       "9349                        2.0  1975-06-21  2016-06-17         1     NaN   \n",
       "7734                        NaN  1988-04-27  2009-06-05         1     NaN   \n",
       "2174                        2.0  1975-06-13  1997-01-22         1     NaN   \n",
       "\n",
       "      report_dt  \n",
       "6444 2018-01-01  \n",
       "3586 2018-01-01  \n",
       "9349 2018-01-01  \n",
       "7734 2018-01-01  \n",
       "2174 2018-01-01  \n",
       "\n",
       "[5 rows x 125 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 8. (Optional) Reading data from SqlDataSource"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Preparing datasets as SQLite data bases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3 as sql\n",
    "\n",
    "for _fname in ('train.db', 'test.db'):\n",
    "    if os.path.exists(_fname):\n",
    "        os.remove(_fname)\n",
    "    \n",
    "train_db = sql.connect('train.db')\n",
    "train_data.to_sql('data', train_db)\n",
    "\n",
    "test_db = sql.connect('test.db')\n",
    "test_data.to_sql('data', test_db)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Using dataset wrapper for a connection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from lightautoml.reader.tabular_batch_generator import SqlDataSource\n",
    "\n",
    "# train_data is replaced with a wrapper for an SQLAlchemy connection\n",
    "# Wrapper requires SQLAlchemy connection string and query to obtain data from\n",
    "train_data = SqlDataSource('sqlite:///train.db', 'select * from data', index='index')\n",
    "test_data = SqlDataSource('sqlite:///test.db', 'select * from data', index='index')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## AutoML preset usage"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1. Create Task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6.11 ms, sys: 1.41 ms, total: 7.52 ms\n",
      "Wall time: 5.65 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "task = Task('binary', )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2. Setup columns roles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Roles setup here set target column and base date, which is used to calculate date differences:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 48 µs, sys: 32 µs, total: 80 µs\n",
      "Wall time: 95.1 µs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "roles = {'target': TARGET_NAME,\n",
    "         DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',\n",
    "         }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3. Create AutoML from preset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To create AutoML model here we use `TabularAutoML` preset, which looks like:\n",
    "\n",
    "<img src=\"../../imgs/tutorial_2_pipeline.png\" alt=\"TabularAutoML preset pipeline\" style=\"width:100%;\"/>\n",
    "\n",
    "All params we set above can be send inside preset to change its configuration:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "oof_pred:\n",
      "array([[0.0226106 ],\n",
      "       [0.02359573],\n",
      "       [0.02438388],\n",
      "       ...,\n",
      "       [0.02287533],\n",
      "       [0.15669319],\n",
      "       [0.08664417]], dtype=float32)\n",
      "Shape = (8000, 1)\n",
      "CPU times: user 4min 19s, sys: 3.59 s, total: 4min 23s\n",
      "Wall time: 1min 11s\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "\n",
    "automl = TabularAutoML(task = task, \n",
    "                       timeout = TIMEOUT,\n",
    "                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},\n",
    "                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},\n",
    "                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},\n",
    "                       lgb_params = {'default_params': {'num_threads': N_THREADS}})\n",
    "oof_pred = automl.fit_predict(train_data, roles = roles)\n",
    "print('oof_pred:\\n{}\\nShape = {}'.format(oof_pred, oof_pred.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4. Predict to test data and check scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction for test data:\n",
      "array([[0.05828221],\n",
      "       [0.07749337],\n",
      "       [0.02520473],\n",
      "       ...,\n",
      "       [0.05070161],\n",
      "       [0.0373171 ],\n",
      "       [0.23640296]], dtype=float32)\n",
      "Shape = (2000, 1)\n",
      "Check scores...\n",
      "OOF score: 0.7500913646530726\n",
      "TEST score: 0.7331657608695653\n",
      "CPU times: user 1.05 s, sys: 4.05 ms, total: 1.06 s\n",
      "Wall time: 449 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "test_pred = automl.predict(test_data)\n",
    "print('Prediction for test data:\\n{}\\nShape = {}'\n",
    "              .format(test_pred, test_pred.shape))\n",
    "\n",
    "print('Check scores...')\n",
    "print('OOF score: {}'.format(roc_auc_score(train_data.data[TARGET_NAME].values, oof_pred.data[:, 0])))\n",
    "print('TEST score: {}'.format(roc_auc_score(test_data.data[TARGET_NAME].values, test_pred.data[:, 0])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 5. Create AutoML with time utilization "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Below we are going to create specific AutoML preset for TIMEOUT utilization (try to spend it as much as possible):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "oof_pred:\n",
      "array([[0.0343032 ],\n",
      "       [0.01933593],\n",
      "       [0.02276292],\n",
      "       ...,\n",
      "       [0.02349434],\n",
      "       [0.17084229],\n",
      "       [0.09522362]], dtype=float32)\n",
      "Shape = (8000, 1)\n",
      "CPU times: user 16min 54s, sys: 12.3 s, total: 17min 6s\n",
      "Wall time: 4min 29s\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "\n",
    "automl = TabularUtilizedAutoML(task = task, \n",
    "                       timeout = TIMEOUT,\n",
    "                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},\n",
    "                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},\n",
    "                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},\n",
    "                       lgb_params = {'default_params': {'num_threads': N_THREADS}})\n",
    "oof_pred = automl.fit_predict(train_data, roles = roles)\n",
    "print('oof_pred:\\n{}\\nShape = {}'.format(oof_pred, oof_pred.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 6. Predict to test data and check scores for utilized automl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction for test data:\n",
      "array([[0.05981494],\n",
      "       [0.07601136],\n",
      "       [0.02678316],\n",
      "       ...,\n",
      "       [0.04721078],\n",
      "       [0.03855655],\n",
      "       [0.19377196]], dtype=float32)\n",
      "Shape = (2000, 1)\n",
      "Check scores...\n",
      "OOF score: 0.7586795357421285\n",
      "TEST score: 0.730679347826087\n",
      "CPU times: user 2.99 s, sys: 64.1 ms, total: 3.05 s\n",
      "Wall time: 1.21 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "test_pred = automl.predict(test_data)\n",
    "print('Prediction for test data:\\n{}\\nShape = {}'\n",
    "              .format(test_pred, test_pred.shape))\n",
    "\n",
    "print('Check scores...')\n",
    "print('OOF score: {}'.format(roc_auc_score(train_data.data[TARGET_NAME].values, oof_pred.data[:, 0])))\n",
    "print('TEST score: {}'.format(roc_auc_score(test_data.data[TARGET_NAME].values, test_pred.data[:, 0])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "77738023c946bcedeeb6a5c983bcfb7849325694ebd87acbc9267f45e9f4af48"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
